Example report:
Number of input reads | 93371899
Average input read length | 152
UNIQUE READS:
Uniquely mapped reads number | 58118693
Uniquely mapped reads % | 62.24%
Average mapped length | 133.52
Number of splices: Total | 9287778
Number of splices: Annotated (sjdb) | 0
Number of splices: GT/AG | 9198064
Number of splices: GC/AG | 62090
Number of splices: AT/AC | 5194
Number of splices: Non-canonical | 22430
Mismatch rate per base, % | 2.03%
Deletion rate per base | 0.01%
Deletion average length | 1.43
Insertion rate per base | 0.00%
Insertion average length | 1.43
MULTI-MAPPING READS:
Number of reads mapped to multiple loci | 2779036
% of reads mapped to multiple loci | 2.98%
Number of reads mapped to too many loci | 36707
% of reads mapped to too many loci | 0.04%
UNMAPPED READS:
Number of reads unmapped: too many mismatches | 0
% of reads unmapped: too many mismatches | 0.00%
Number of reads unmapped: too short | 31910266
% of reads unmapped: too short | 34.18%
Number of reads unmapped: other | 527197
% of reads unmapped: other | 0.56%
CHIMERIC READS:
Number of chimeric reads | 65461
% of chimeric reads | 0.07%
library(ggplot2)
library(UpSetR)
setwd('~/git/recount-pump/metadata/qc_expt')
This is a subsample, taking a random 20% of the total # rows from samples.tsv. This way of sampling is flawed because it does not keep studies intact.
I think that most of the STAR-related QC measures come from the Log.final.out output file.
m <- read.table('samples.sorted_by_study.tsv', sep='\t', comment.char='', quote='', header=T)
frac_non <- m$star.number_of_splices._non.canonical / m$star.number_of_splices._total
frac_non_thresh <- frac_non > 0.4
ggplot(m, aes(x=1:nrow(m), y=frac_non, color=frac_non > 0.4, alpha=I(0.2))) +
geom_point() +
theme_bw()
## Warning: Removed 514 rows containing missing values (geom_point).
frac_non_samps <- which(frac_non_thresh)
ggplot(m, aes(y=frac_non)) + geom_boxplot() + theme_bw()
## Warning: Removed 514 rows containing non-finite values (stat_boxplot).
short_frac <- m$star.number_of_reads_unmapped._too_short / m$star.number_of_input_reads
short_frac_thresh <- short_frac > 0.8
ggplot(m, aes(x=1:nrow(m), y=short_frac, color=short_frac > 0.8, alpha=I(0.05))) +
geom_point() +
theme_bw()
short_frac_samps <- which(short_frac_thresh)
assigned_frac <- m$gene_fc_count_all.assigned/m$gene_fc_count_all.total
assigned_frac_thresh <- assigned_frac < 0.05
ggplot(m, aes(x=1:nrow(m), y=assigned_frac, color=assigned_frac < 0.05, alpha=I(0.05))) +
geom_point() + theme_bw() + labs(x='Index', y='% assigned to gene')
## Warning: Removed 144 rows containing missing values (geom_point).
assigned_frac_samps <- which(assigned_frac_thresh)
nreads <- (m$star.number_of_input_reads + m$star.number_of_input_reads2)
nmapped <- (m$star.all_mapped_reads + m$star.all_mapped_reads2)
mapped <- nmapped / nreads
mapped_thresh <- mapped < 0.01
mapped_samps <- which(mapped_thresh)
ggplot(m, aes(x=1:nrow(m), y=mapped, color=mapped < 0.01, alpha=I(0.5))) +
geom_point() + theme_bw() + labs(x='Index', y='% mapped')
nrep <- (m$star.number_of_reads_mapped_to_multiple_loci + m$star.number_of_reads_mapped_to_multiple_loci2)
repal <- nrep / nmapped
repal_thresh <- repal > 0.85
repal_samps <- which(repal_thresh)
ggplot(m, aes(x=1:nrow(m), y=repal, color=repal > 0.85, alpha=I(0.5))) +
geom_point() + theme_bw() + labs(x='Index', y='Fraction of aligned reads that aligned repetitively')
## Warning: Removed 141 rows containing missing values (geom_point).
mt <- m$aligned_reads..chrm
mt_thresh <- mt > 40
mt_samps <- which(mt_thresh)
ggplot(m, aes(x=1:nrow(m), y=mt, color=mt > 40, alpha=I(0.5))) +
geom_point() + theme_bw() + labs(x='Index', y='% mapped to MT')
few <- m$star.number_of_input_reads + m$star.number_of_input_reads2
few_thresh <- few < 10000
few_samps <- which(few_thresh)
ggplot(m, aes(x=1:nrow(m), y=few, color=few < 10000, alpha=I(0.5))) +
geom_point() + theme_bw() + labs(x='Index', y='# reads')
# percent
pct_chim <- m$star.._of_chimeric_reads
pct_chim_thresh <- pct_chim > 20
ggplot(m, aes(x=1:nrow(m), y=pct_chim, color=pct_chim > 20, alpha=I(0.5))) +
geom_point() + theme_bw() + labs(x='Index', y='% chimeric')
pct_chim_samps <- which(pct_chim_thresh)
read_len <- m$star.average_input_read_length
read_len_thresh <- read_len <= 30
ggplot(m, aes(x=1:nrow(m), y=read_len, color=read_len <= 30, alpha=I(0.5))) +
geom_point() +
theme_bw()
read_len_samps <- which(read_len_thresh)
table(m$study_acc[read_len < 27])
##
## DRP002623 ERP021735 SRP000762 SRP002915 SRP003726 SRP004042 SRP004847 SRP004965
## 7 1 7 27 14 3 8 3
## SRP005279 SRP008218 SRP009247 SRP014830 SRP017786 SRP029889 SRP032279 SRP039684
## 36 14 14 1 6 25 119 3
## SRP041993 SRP055513 SRP058476 SRP061466 SRP065282 SRP068551 SRP098746 SRP100634
## 3 111 4 16 1 18 2 21
## SRP103771 SRP104148 SRP108951 SRP115598 SRP117282 SRP119821 SRP125173 SRP130953
## 2 1 16 4 1 1 1 3
## SRP132520 SRP137150 SRP140829 SRP168237 SRP170234 SRP183467 SRP188526 SRP189162
## 4 1 6 4 10 18 80 16
## SRP193787 SRP198932 SRP199640 SRP200058 SRP218935
## 12 2 12 8 1
indel <- m$star.deletion_rate_per_base + m$star.insertion_rate_per_base
indel_thresh <- indel > 0.1
ggplot(m, aes(x=1:nrow(m), y=indel, color=indel > 0.1)) +
geom_point() + theme_bw() + labs(x='Index', y='Insertion + deletion rate')
indel_samps <- which(indel_thresh)
Trying to identify who that green tower is.
table(m$study_acc[indel > 2])
##
## SRP061888 SRP073426 SRP166108
## 8 12 70
I was expecting the mismatch rate per-base measure to be a fraction, but I’m clearly misunderstanding. This one needs more work.
mm_rate <- m$star.mismatch_rate_per_base._.
mm_rate_mean <- mean(mm_rate)
mm_rate_sd <- sd(mm_rate)
mm_rate_thresh <- mm_rate > mm_rate_mean + (2*mm_rate_sd)
ggplot(m, aes(x=1:nrow(m), y=mm_rate, color=mm_rate > mm_rate_mean + (2*mm_rate_sd), alpha=I(0.3))) +
geom_point() + theme_bw() + labs(x='Index', y='Mismatch rate (%) per base')
mm_rate_samps <- which(mm_rate_thresh)
Why is this ever negative?
sc_avg <- (m$star.average_input_read_length + m$star.average_input_read_length2 - m$star.average_mapped_length - m$star.average_mapped_length2) / (m$star.average_input_read_length + m$star.average_input_read_length2)
sc_avg_thresh <- sc_avg > 0.15
ggplot(m, aes(x=1:nrow(m), y=sc_avg, color=sc_avg > 0.15)) +
geom_point() + theme_bw() + labs(x='Index', y='Avg proportion soft clipped')
sc_avg_samps <- which(sc_avg_thresh)
table(m$study_acc[sc_avg > 0.95])
##
## DRP000499 ERP021420 SRP001371 SRP028887 SRP035934 SRP040110 SRP064863 SRP073813
## 1 1 1 4 2 1 11 1
## SRP093667 SRP100634 SRP114956 SRP128731 SRP144016 SRP150087 SRP153798 SRP155564
## 1 21 1 1 2 1 2 2
## SRP168237 SRP170234 SRP189162 SRP193787 SRP194624 SRP199640 SRP223635
## 4 10 16 12 33 12 15
other <- m$star.number_of_reads_unmapped._other / m$star.number_of_input_reads
other_thresh <- other > 0.1
other_samps <- which(other_thresh)
ggplot(m, aes(x=1:nrow(m), y=other, color=other > 0.1)) +
geom_point() + theme_bw() + labs(x='Index', y='Fraction reads unmapped for "other" reason')
table(m$study_acc[other_thresh])
##
## DRP000499 DRP002623 ERP009114 ERP021735 ERP104024 ERP111307 ERP115036 ERP116477
## 1 2 1 2 12 2 52 2
## SRP000762 SRP001371 SRP002079 SRP002640 SRP003726 SRP004042 SRP004847 SRP004965
## 2 1 1 3 1 1 2 1
## SRP005177 SRP006475 SRP007417 SRP007596 SRP008218 SRP009276 SRP010279 SRP011085
## 1 1 1 1 1 3 1 4
## SRP012062 SRP012557 SRP013239 SRP013450 SRP013565 SRP014591 SRP017411 SRP017575
## 1 1 1 1 1 1 1 52
## SRP018861 SRP019272 SRP019994 SRP021191 SRP022920 SRP023262 SRP024674 SRP028170
## 1 1 98 10 6 76 1 4
## SRP028180 SRP029953 SRP032455 SRP033276 SRP034953 SRP036053 SRP036821 SRP040472
## 4 4 3 9 6 7 43 73
## SRP041833 SRP042161 SRP044956 SRP055513 SRP056802 SRP061707 SRP062144 SRP063500
## 1 1 4 1 1 1 8 1
## SRP067502 SRP067524 SRP068609 SRP068969 SRP073940 SRP076107 SRP081236 SRP082426
## 11 2 11 11 1 1 3 7
## SRP083756 SRP094435 SRP095307 SRP096338 SRP097912 SRP100634 SRP101731 SRP102077
## 1 1 2 2 1 21 4 5
## SRP106148 SRP107789 SRP110081 SRP111459 SRP112553 SRP113558 SRP114410 SRP114672
## 1 9 6 1 4 1 1 1
## SRP117905 SRP121075 SRP131142 SRP132701 SRP133108 SRP133393 SRP144189 SRP144382
## 3 1 6 4 1 1 2 6
## SRP144647 SRP150087 SRP150331 SRP150456 SRP151471 SRP155771 SRP157907 SRP163252
## 2 1 2 1 1 1 2 1
## SRP163259 SRP168237 SRP168244 SRP170234 SRP173190 SRP173497 SRP174449 SRP174485
## 11 4 19 10 4 4 24 2
## SRP175016 SRP183467 SRP183811 SRP186406 SRP186450 SRP186455 SRP189116 SRP189162
## 1 6 2 2 5 1 1 16
## SRP189867 SRP193787 SRP193940 SRP194624 SRP197074 SRP198244 SRP198803 SRP199640
## 2 12 1 53 2 22 2 12
## SRP219272 SRP219652 SRP221385
## 1 2 2
juncs_per_base <- m$junction_count / (m$star.all_mapped_reads * m$star.average_mapped_length)
juncs_per_base_thresh <- juncs_per_base < 0.000001
ggplot(m, aes(x=1:nrow(m), y=juncs_per_base, color=juncs_per_base < 0.000001, alpha=I(0.1))) +
geom_point() + theme_bw() + labs(x='Index', y='Junctions per aligned base') + ylim(0, 0.00001)
## Warning: Removed 296275 rows containing missing values (geom_point).
juncs_per_base_samps <- which(juncs_per_base_thresh)
upset(fromList(list(juncs_per_base=juncs_per_base_samps,
mm_rate=mm_rate_samps,
indel=indel_samps,
read_len=read_len_samps,
pct_chim=pct_chim_samps,
assigned_frac=assigned_frac_samps,
frac_non=frac_non_samps,
short_frac=short_frac_samps,
mapped=mapped_samps,
few_reads=few_samps,
mt_pct=mt_samps,
rep_al=repal_samps,
unmapped_other=other_samps)), nsets = 13, order.by = "freq")